suppressPackageStartupMessages({
## Common
library(tidyverse)
library(magrittr)
library(future.apply)
library(here)
library(AnnotationHub)
library(purrr)
library(scales)
library(kableExtra)
library(tictoc)
library(ggrepel)
library(RColorBrewer)
library(ggpubr)
library(pander)
library(rmarkdown)
## Project specific
library(UpSetR)
library(SeqArray)
library(ngsReports)
})
if (interactive()) setwd(here::here())
theme_set(theme_bw())
cores <- availableCores() - 1
source("~/bioinformatics/bioToolkit/lbFuncs.R")
ah <- AnnotationHub() %>%
subset(species == "Danio rerio") %>%
subset(rdataclass == "EnsDb")
ensDb <- ah[["AH83189"]] ## Ens101
genes <- genes(ensDb)
mcols(genes) <- mcols(genes)[
c("gene_id", "gene_name", "gene_biotype", "entrezid")
]
exons <- exonsBy(ensDb, by = "gene")
An EnsDb object for Ensemble release 101 was setup for extracting gene and exon annotation information.
metadata <- read_csv(here("files/SraRunTable.txt")) %>%
as.data.frame() %>%
dplyr::arrange(Run) %>%
mutate(
Genotype = factor(Genotype, levels = unique(Genotype)),
Run = factor(Run, levels = Run),
alias = c(
paste0(rep("WT", 8), seq(1, 8)),
paste0(rep("V1482Afs", 6), seq(1, 6)),
paste0(rep("R122Pfs", 4), seq(1, 4)),
paste0(rep("Trans", 6), seq(1, 6))
)
) %>%
dplyr::select(
sample = Run, genotype = Genotype, alias, gender, tank = Tank
)
metadata %>%
dplyr::rename(
Sample = sample, Genotype = genotype, Alias = alias, Gender = gender,
Tank = tank
) %>%
kable(
align = "l",
caption = "Sample metadata"
) %>%
kable_styling(
bootstrap_options = c("striped", "hover", "condensed", "responsive")
)
| Sample | Genotype | Alias | Gender | Tank |
|---|---|---|---|---|
| SRR11951228 | wild-type | WT1 | female | 1 |
| SRR11951229 | wild-type | WT2 | female | 3 |
| SRR11951230 | wild-type | WT3 | male | 2 |
| SRR11951231 | wild-type | WT4 | male | 3 |
| SRR11951232 | wild-type | WT5 | male | 1 |
| SRR11951233 | wild-type | WT6 | female | 3 |
| SRR11951234 | wild-type | WT7 | male | 2 |
| SRR11951235 | wild-type | WT8 | female | 1 |
| SRR11951236 | sorl1V1482Afs/+ | V1482Afs1 | male | 1 |
| SRR11951237 | sorl1V1482Afs/+ | V1482Afs2 | male | 2 |
| SRR11951238 | sorl1V1482Afs/+ | V1482Afs3 | male | 3 |
| SRR11951239 | sorl1V1482Afs/+ | V1482Afs4 | female | 1 |
| SRR11951240 | sorl1V1482Afs/+ | V1482Afs5 | female | 2 |
| SRR11951241 | sorl1V1482Afs/+ | V1482Afs6 | female | 3 |
| SRR11951242 | sorl1R122Pfs/+ | R122Pfs1 | male | 2 |
| SRR11951243 | sorl1R122Pfs/+ | R122Pfs2 | male | 3 |
| SRR11951244 | sorl1R122Pfs/+ | R122Pfs3 | female | 1 |
| SRR11951245 | sorl1R122Pfs/+ | R122Pfs4 | female | 1 |
| SRR11951246 | sorl1V1482Afs/R122Pfs | Trans1 | female | 1 |
| SRR11951247 | sorl1V1482Afs/R122Pfs | Trans2 | female | 2 |
| SRR11951248 | sorl1V1482Afs/R122Pfs | Trans3 | female | 3 |
| SRR11951249 | sorl1V1482Afs/R122Pfs | Trans4 | male | 2 |
| SRR11951250 | sorl1V1482Afs/R122Pfs | Trans5 | male | 1 |
| SRR11951251 | sorl1V1482Afs/R122Pfs | Trans6 | male | 3 |
genoCols <- metadata$genotype %>%
levels() %>%
length() %>%
brewer.pal("Set1") %>%
setNames(levels(metadata$genotype))
drChrs <- seq(1:25)
rawFqc <- list.files(
path = here("00_rawData/FastQC/"),
pattern = "zip",
full.names = TRUE
) %>%
FastqcDataList()
Library Sizes for the raw, unprocessed dataset ranged between 30,648,602 and 37,751,699 reads.
plotReadTotals(rawFqc)
plotly::ggplotly({
plotGcContent(
x = rawFqc,
plotType = "line",
gcType = "Transcriptome",
species = "Drerio"
) +
theme(legend.position = "none")
})
GC content distributions for all samples. Hover for more details.
Trimming software was setup carefully for this dataset as the intention was to perform downstream detection of short variants following the GATK workflow. Part of this process involves the removal of duplicate reads which relies on detection of duplicates by comparing sequences in the 5’ positions. Trimming can therefore be detrimental to downstream analysis. As such, the trimming procedure was performed more so as a filtering step, where reads were discarded if:
trimFqc <- list.files(
path = here("01_trim/FastQC/"),
pattern = "zip",
full.names = TRUE
) %>%
FastqcDataList()
trimStats <- readTotals(rawFqc) %>%
dplyr::rename(Raw = Total_Sequences) %>%
left_join(readTotals(trimFqc), by = "Filename") %>%
dplyr::rename(Trimmed = Total_Sequences) %>%
mutate(
Discarded = 1 - Trimmed/Raw,
Retained = Trimmed / Raw
)
After trimming between 0.48% and 0.83% of reads were discarded.
Trimmed reads were aligned to GRCz11 reference genome (Ensembl release 101) with STAR 2.7.7a. STAR’s two-pass mode was chosen to achieve better alignments around novel splice junctions.
dupeMetrics <- list.files(here("03_markDuplicates/log/"), full.names = TRUE) %>%
lapply(function(x){
sample <- basename(x) %>%
str_remove(".tsv")
read_tsv(x, comment = "#") %>%
mutate(sample = sample)
}) %>%
bind_rows() %>%
dplyr::select(
sample,
reads = UNPAIRED_READS_EXAMINED,
secondary = SECONDARY_OR_SUPPLEMENTARY_RDS,
percent.duplication = PERCENT_DUPLICATION
)
Between 63.54% and 79.44% of reads were marked as duplicates across all samples. The representative read was chosen by random, as opposed to a base quality scoring strategy, such that reference allele mapping bias was avoided for downstream Allele Specific Expression (ASE) testing.
Variants were restricted to those that lie only in exonic regions of the Ensembl version 101 primary assembly. For zebrafish this consists of chromosomes 1 to 25. To perform this inside of GATK’s variant calling HaploypeCaller command, an interval list was created containing the exonic ranges. A number of interval list formats are supported by GATK as described here. The GATK-style .intervals is poorly described but requires the format <chr>:<start>-<end> e.g. 1:1367-1367 for a singular base position. The nomenclature must match that of the chosen reference, for example, Ensembl labels chromosome 1 as “1”, not “chr1”. The GATK-style format was chosen for its simplicity.
Defining intervals before calling variants also has the advantage of a large speed-up in computational processing time. HaplotypeCaller is the longest step in the GATK short variant discovery pipeline, sometimes taking over a day to process for large datasets on a HPC system when interval lists are not utilised.
exonRanges <- exons %>%
unlist() %>%
GenomicRanges::reduce() %>%
as.data.frame() %>%
dplyr::filter(seqnames %in% drChrs) %>%
dplyr::select(chromosome = seqnames, start, end)
intervalPath <- "/hpcfs/users/a1647910/210216_sorl1_snv/08_callSnvs/intervals/exons.intervals"
makeIntervals <- !file.exists(intervalPath)
if (makeIntervals) {
dir.create(dirname(intervalPath), recursive = TRUE)
tibble(
interval = paste0(
exonRanges$chromosome,
":",
exonRanges$start,
"-",
exonRanges$end
)
) %>%
write.table(
file = intervalPath,
col.names = FALSE,
row.names = FALSE,
quote = FALSE
)
}
calledMetrics <- list.files(
# path = here("08_callSnvs/called/log/"),
path = "/hpcfs/users/a1647910/210216_sorl1_snv/08_callSnvs/called/log/",
pattern = "detail",
full.names = TRUE
) %>%
lapply(read_tsv, comment = "#") %>%
bind_rows() %>%
dplyr::select(-contains(c("DBSNP", "DB_SNP", "NOVEL"))) %>%
as.data.frame()
filteredMetrics <- list.files(
# path = here("08_callSnvs/filtered/log/"),
path = "/hpcfs/users/a1647910/210216_sorl1_snv/08_callSnvs/filtered/log/",
pattern = "detail",
full.names = TRUE
) %>%
lapply(read_tsv, comment = "#") %>%
bind_rows() %>%
dplyr::select(-contains(c("DBSNP", "DB_SNP", "NOVEL"))) %>%
as.data.frame()
selectedMetrics <- list.files(
# path = here("08_callSnvs/selected/log/"),
path = "/hpcfs/users/a1647910/210216_sorl1_snv/08_callSnvs/selected/log/",
pattern = "detail",
full.names = TRUE
) %>%
lapply(read_tsv, comment = "#") %>%
bind_rows() %>%
dplyr::select(-contains(c("DBSNP", "DB_SNP", "NOVEL"))) %>%
as.data.frame()
filtProgress <- calledMetrics %>%
as_tibble() %>%
dplyr::select(sample = SAMPLE_ALIAS, initial = TOTAL_SNPS)
filtProgress <- selectedMetrics %>%
as_tibble() %>%
dplyr::select(sample = SAMPLE_ALIAS, biallelic = TOTAL_SNPS) %>%
left_join(filtProgress)
GATK’s initial variant calling output provides a number of different types of variants, for example indels, single nucleotide, multi nucleotide variants. The total number of all types of variants ranges between 10 462 000.00% and 20 652 400.00%.
It is recommended that only single nucleotide variants (SNVs) are used for ASE analysis. Furthermore, only biallelic SNVs are informative for estimating allelic proportions in terms of ASE. A filter was applied to such that only biallelic SNVs remained for ASE analysis. The number of biallelic SNVs ready for ASE analysis ranged between 10 544 600.00% and 8 054 300.00%.
There are a number of important quality control measures that must be taken to ensure reliable data for allele specific expression (ASE) testing. The following procedures are based around the guidelines described in Castel et al. Tools and Best Practices for allelic expression analysis.
To gather SNV information as determined by the GATK workflow, VCF files were converted into GDS objects. GDS objects allow for easy access of the data in R, but are large so were kept on the HPC system.
vcfPaths <- list.files(
"/hpcfs/users/a1647910/210216_sorl1_snv/08_callSnvs/selected",
pattern = ".vcf.gz$",
full.names = TRUE
)
lapply(vcfPaths, function(vcf){
gdsPath <- paste0(
str_remove(dirname(vcf), "selected"),
"gds/",
str_remove(basename(vcf), ".vcf.gz"),
".gds"
)
if (!file.exists(gdsPath)) {
if (!dir.exists(dirname(gdsPath))) {
dir.create(dirname(gdsPath), recursive = TRUE)
}
seqVCF2GDS(vcf, gdsPath)
}
})
gdsPaths <- list.files(
"/hpcfs/users/a1647910/210216_sorl1_snv/08_callSnvs/gds",
pattern = ".gds$",
full.names = TRUE
)
snvList <- lapply(seq_along(gdsPaths), function(x){
gdsPath <- gdsPaths[x]
sample <- basename(gdsPath) %>%
str_remove(".gds")
gds <- seqOpen(gdsPath, readonly = FALSE)
snvs <- tibble(
variant.id = seqGetData(gds, "variant.id"),
chromosome = seqGetData(gds, "chromosome"),
position = seqGetData(gds, "position"),
allele = seqGetData(gds, "allele"),
filter = seqGetData(gds, "annotation/filter")
) %>%
# dplyr::filter(
# filter == "PASS"
# ) %>%
mutate(
sample = sample,
refAllele = str_split(allele, ",", simplify = TRUE)[,1],
altAllele = str_split(allele, ",", simplify = TRUE)[,2]
)
seqClose(gds)
return(snvs)
})
An important aspect to consider in ASE analysis is the potential for read mapping bias. For RNA-seq data mapped to a reference genome, reads that carry an alternate allele at positions of variation have at least one mismatch, and therefore a lower probability of aligning correctly than reads containing the reference allele.
WASP provides a suite of tools for unbiased allele-specific read mapping. The WASP workflow for mappability filtering is defined by 5 steps:
STAR was chosen in this case).find_intersecting_snps.py script. Each overlapping read is output as a set of synthetic reads in FASTQ format with its allele swapped to to the other allele at the SNV site.filter_remapped_reads.py script.samtools, resulting in a complete set of mappability-filtered aligned reads in BAM format.To begin the WASP fltering procedure, input files containing genetic variants (SNVs) previously identified by the GATK workflow are needed. Since we do not have phasing information, it is recommended to use text-based format opposed to HDF5 for the input files. The text-based input files require three space-delimited columns (position, ref_allele, alt_allele), and one input file per chromosome. The filenames must follow the convention <chr>.snps.txt.gz (e.g 1.snps.txt.gz for chromosome 1).
lapply(snvList, function(x){
chrList <- x %>%
dplyr::select(chromosome, position, refAllele, altAllele, sample) %>%
split(.[,'chromosome'])
lapply(chrList, function(x){
chr <- unique(x$chromosome)
sample <- unique(x$sample)
path <- paste0(
"/hpcfs/users/a1647910/210216_sorl1_snv/09_wasp/snvs/",
sample,
"/",
chr,
".snps.txt.gz"
)
if (!file.exists(path)) {
if (!dir.exists(dirname(path))) {
dir.create(dirname(path), recursive = TRUE)
}
tibble(
position = x$position,
refAllele = x$refAllele,
altAllele = x$altAllele
) %>%
write_delim(file = path, delim = " ", col_names = FALSE)
}
})
})
Text-based input files were created for WASP filtering, which was subsequently performed on the HPC system. The resulting BAM files were then subject to allele specific expression read counting with ASEReadCounter.
GATK tool ASEReadCounter calculates allele counts at a set of positions after applying filters specifically tuned for ASE analysis of RNA-seq data. The filters operate on mapping quality, base quality, depth of coverage and overlapping paired reads. Each of these filters can be controlled by command-line arguments. The data in this analysis was generated with the following options:
--min-mapping-quality 10--min-base-quality 20--count-overlap-reads-handling COUNT_FRAGMENTS_REQUIRE_SAME_BASE--min-depth-of-non-filtered-base -1ASEReadCounter was run on BAM alignment files that were subject to WASP filtering and also alignment files that had not been WASP filtered, to compare whether the filtering procedure improves any observable reference allele mapping bias. Heterozygous SNVs detected by ASEReadCounter were recorded in separate tsv files for each sample. These were loaded into R as a list of tibbles containing allele count data for ASE analysis.
files_nowasp <- list.files(
"/hpcfs/users/a1647910/210216_sorl1_snv/10_aseReadCounter/nowasp",
full.names = TRUE
)
samples <- basename(files_nowasp) %>%
str_remove(".tsv")
aseRC_nowasp <- lapply(files_nowasp, function(file){
sample <- basename(file) %>%
str_remove(".tsv")
read_tsv(
file,
col_types = "cdcccdddddddd"
) %>%
mutate(
sample = sample,
allele = paste0(refAllele, ",", altAllele)
) %>%
left_join(metadata[,c("sample", "genotype")]) %>%
dplyr::select(
chromosome = contig, position, allele, refCount, altCount, totalCount,
sample, genotype, lowMAPQDepth, lowBaseQDepth, rawDepth, otherBases,
improperPairs, refAllele, altAllele
)
}) %>%
set_names(samples)
files_wasp <- list.files(
"/hpcfs/users/a1647910/210216_sorl1_snv/10_aseReadCounter/wasp",
full.names = TRUE
)
aseRC_wasp <- lapply(files_wasp, function(file){
sample <- basename(file) %>%
str_remove(".tsv")
read_tsv(
file,
col_types = "cdcccdddddddd"
) %>%
mutate(
sample = sample,
allele = paste0(refAllele, ",", altAllele)
) %>%
left_join(metadata[,c("sample", "genotype")]) %>%
dplyr::select(
chromosome = contig, position, allele, refCount, altCount, totalCount,
sample, genotype, lowMAPQDepth, lowBaseQDepth, rawDepth, otherBases,
improperPairs, refAllele, altAllele
)
}) %>%
set_names(samples)
filtProgress <- list(
nowasp = aseRC_nowasp %>%
bind_rows() %>%
group_by(sample) %>%
summarise(
aseRC = n()
) %>%
left_join(filtProgress),
wasp = aseRC_wasp %>%
bind_rows() %>%
group_by(sample) %>%
summarise(
software = n()
) %>%
left_join(filtProgress)
)
SNV sites with low read coverage are uninformative, and therefore were subject to filtering. To determine the optimal read coverage value for filtering, the effects of potential cut-offs were inspected.
covBins_nowasp <- lapply(aseRC_nowasp, function(x){
n0 <- x %>%
dplyr::filter(totalCount >= 0) %>%
nrow()
n10 <- x %>%
dplyr::filter(totalCount >= 10) %>%
nrow()
n20 <- x %>%
dplyr::filter(totalCount >= 20) %>%
nrow()
n30 <- x %>%
dplyr::filter(totalCount >= 30) %>%
nrow()
n60 <- x %>%
dplyr::filter(totalCount >= 60) %>%
nrow()
sample <- unique(x$sample)
tibble(
bin = c("\u2265 0", "\u2265 10", "\u2265 20", "\u2265 30", "\u2265 60"),
n = c(n0, n10, n20, n30, n60),
sample = sample,
wasp = "Without WASP filtering"
)
}) %>%
bind_rows()
covBins_wasp <- lapply(aseRC_wasp, function(x){
n0 <- x %>%
dplyr::filter(totalCount >= 0) %>%
nrow()
n10 <- x %>%
dplyr::filter(totalCount >= 10) %>%
nrow()
n20 <- x %>%
dplyr::filter(totalCount >= 20) %>%
nrow()
n30 <- x %>%
dplyr::filter(totalCount >= 30) %>%
nrow()
n60 <- x %>%
dplyr::filter(totalCount >= 60) %>%
nrow()
sample <- unique(x$sample)
tibble(
bin = c("\u2265 0", "\u2265 10", "\u2265 20", "\u2265 30", "\u2265 60"),
n = c(n0, n10, n20, n30, n60),
sample = sample,
wasp = "With WASP filtering"
)
}) %>%
bind_rows()
bind_rows(covBins_nowasp, covBins_wasp) %>%
mutate(wasp = factor(wasp, levels = c(
"Without WASP filtering", "With WASP filtering"
))) %>%
ggplot(aes(bin, n, fill = bin)) +
geom_boxplot() +
labs(x = "Reads / SNV", y = "Number of SNVs") +
theme(legend.position = "none") +
facet_wrap(~wasp)
The number of SNVs per sample that satisfy read coverage cut-offs
A cut-off of at least 10 counts per SNV position was chosen as it provides a reasonable estimate of ASE but does not remove too many SNVs.
aseRC_wasp <- lapply(aseRC_wasp, function(x){
dplyr::filter(x, totalCount >= 10)
})
aseRC_nowasp <- lapply(aseRC_nowasp, function(x){
dplyr::filter(x, totalCount >= 10)
})
filtProgress <- list(
nowasp = aseRC_nowasp %>%
bind_rows() %>%
group_by(sample) %>%
summarise(
coverage = n()
) %>%
left_join(filtProgress$nowasp),
wasp = aseRC_wasp %>%
bind_rows() %>%
group_by(sample) %>%
summarise(
coverage = n()
) %>%
left_join(filtProgress$wasp)
)
Producing evidence of mono-allelic expression from short read RNA-seq datasets without parental genotypes or imprinting information is a controversial issue. SNV sites were therefore filtered based on a depth criteria for each allele (\(\ge\) 3 counts or \(>\) 1% of the total counts for that allele).
aseRC_wasp <- lapply(aseRC_wasp, function(x){
dplyr::filter(
x,
refCount >= 3,
altCount >= 3,
refCount / totalCount > 0.01,
altCount / totalCount > 0.01
)
})
aseRC_nowasp <- lapply(aseRC_nowasp, function(x){
dplyr::filter(
x,
refCount >= 3,
altCount >= 3,
refCount / totalCount > 0.01,
altCount / totalCount > 0.01
)
})
filtProgress <- list(
nowasp = aseRC_nowasp %>%
bind_rows() %>%
group_by(sample) %>%
summarise(
monoallelic = n()
) %>%
left_join(filtProgress$nowasp),
wasp = aseRC_wasp %>%
bind_rows() %>%
group_by(sample) %>%
summarise(
monoallelic = n()
) %>%
left_join(filtProgress$wasp)
)
WASP filtering is designed to remove the vast majority of reads with mapping bias. Reference allele ratios were calculated for all SNVs passing filtering procedures with and without WASP filtering, to determine the effectiveness of WASP. A residual mapping bias is indicated by a large deviation in reference allele ratios from 0.5.
refRatios_nowasp <- lapply(aseRC_nowasp, function(x){
x %>%
dplyr::filter(totalCount != 0) %>%
mutate(
refRatio = refCount / totalCount,
altRatio = altCount / totalCount
) %>%
dplyr::select(
chromosome, position, allele, refRatio, altRatio, totalCount,
sample, genotype
)
}) %>%
bind_rows()
refRatios_wasp <- lapply(aseRC_wasp, function(x){
x %>%
dplyr::filter(totalCount != 0) %>%
mutate(
refRatio = refCount / totalCount,
altRatio = altCount / totalCount
) %>%
dplyr::select(
chromosome, position, allele, refRatio, altRatio, totalCount,
sample, genotype
)
}) %>%
bind_rows()
refRatios_nowasp %>%
left_join(metadata[,c("sample", "genotype")]) %>%
dplyr::arrange(genotype, sample) %>%
ggplot(aes(refRatio, group = sample, colour = genotype)) +
geom_density() +
geom_vline(xintercept = 0.5, linetype = "dashed") +
labs(x = "Reference allele ratio", y = "Density", colour = "Genotype") +
theme(legend.position = "bottom") +
scale_color_manual(values = genoCols) +
ggtitle("Reference allele ratio distributions without WASP filtering")
refRatios_wasp %>%
left_join(metadata[,c("sample", "genotype")]) %>%
dplyr::arrange(genotype, sample) %>%
ggplot(aes(refRatio, group = sample, colour = genotype)) +
geom_density() +
geom_vline(xintercept = 0.5, linetype = "dashed") +
labs(x = "Reference allele ratio", y = "Density", colour = "Genotype") +
theme(legend.position = "bottom") +
scale_color_manual(values = genoCols) +
ggtitle("Reference allele ratio distributions with WASP filtering")
As an alternative viewpoint, boxplots of reference allele ratios were plotted for each sample. Median/mean reference allele ratios that deviate substantially from 0.5 indicate a mapping bias.
refRatios_nowasp %>%
left_join(metadata[,c("sample", "genotype")]) %>%
dplyr::arrange(genotype, sample) %>%
mutate(sample = factor(sample, levels = unique(sample))) %>%
ggplot(aes(sample, refRatio, fill = genotype)) +
geom_boxplot(fatten = 3, outlier.shape = NA) +
geom_hline(yintercept = 0.5, colour = "black") +
stat_summary(fun = mean, geom = "point", colour = "white") +
labs(x = "Sample", y = "Reference allele ratio") +
scale_fill_discrete(name = "Genotype") +
scale_y_continuous(breaks = seq(0, 1, 0.1)) +
guides(colour = FALSE) +
theme(
axis.text.x = element_text(angle = -90, vjust = 0.5),
legend.position = "bottom"
) +
ggtitle(label = "Reference allele ratios without WASP filtering")
Boxplots showing mapping bias for each sample. The mean reference ratio is indicated with a black diamond. Filtering procedures successfully were confirmed to have removed any potential mapping bias.
refRatios_wasp %>%
dplyr::filter(totalCount >= 10) %>%
left_join(metadata[,c("sample", "genotype")]) %>%
dplyr::arrange(genotype, sample) %>%
mutate(sample = factor(sample, levels = unique(sample))) %>%
ggplot(aes(sample, refRatio, fill = genotype)) +
geom_boxplot(fatten = 3, outlier.shape = NA) +
geom_hline(yintercept = 0.5, colour = "black") +
stat_summary(fun = mean, geom = "point", colour = "white") +
labs(x = "Sample", y = "Reference allele ratio") +
scale_fill_discrete(name = "Genotype") +
scale_y_continuous(breaks = seq(0, 1, 0.1)) +
guides(colour = FALSE) +
theme(
axis.text.x = element_text(angle = -90, vjust = 0.5),
legend.position = "bottom"
) +
ggtitle(label = "Reference allele ratios with WASP filtering")
Boxplots showing mapping bias for each sample. The mean reference ratio is indicated with a black diamond. Filtering procedures successfully were confirmed to have removed any potential mapping bias.
While not perfect, WASP filtering improved the reference allele mapping bias and therefore was WASP filtered data chosen for ASE testing.
refBias <- refRatios_wasp %>%
bind_rows() %>%
group_by(sample) %>%
summarise(mean = mean(refRatio), median = median(refRatio))
altBias <- refRatios_wasp %>%
bind_rows() %>%
group_by(sample) %>%
summarise(mean = mean(altRatio), median = median(altRatio))
After quality control procedures, mean reference ratios ranged between 0.514 and 0.519. Median reference ratios ranged between 0.5 and 0.515.
filtProgress$wasp %>%
dplyr::select(
sample, None = initial, `Biallelic SNVs` = biallelic,
`WASP/ASEReadCounter` = software,
Coverage = coverage, `Mono-allelic expression` = monoallelic
) %>%
pivot_longer(cols = -sample, names_to = "Filter", values_to = "SNVs") %>%
mutate(Filter = factor(Filter, levels = unique(.$Filter))) %>%
ggplot(aes(Filter, SNVs, group = Filter, fill = Filter)) +
geom_boxplot() +
# scale_y_log10(
# labels = comma,
# breaks = c(seq(0, 500000, by = 100000), seq(750000, 1500000, by = 250000))
# ) +
labs(y = "Number of SNVs") +
theme(
legend.position = "none",
axis.title.x = element_blank(),
axis.text.x = element_text(angle = 315, hjust = 0),
axis.ticks.x = element_blank()
)
Boxplots of the total number of SNVs remaining in each sample after various quality control filtering steps. The filters were applied sequentially from left to right across the x-axis, such that the remaining number of SNVs continually decreases.
# genesWithSnvs <- lapply(aseRC, function(x){
# sample <- unique(x$sample)
# atLeast1 <- length(unique(x$gene))
# atLeast2 <- x %>%
# group_by(gene) %>%
# summarise(n = n()) %>%
# dplyr::filter(n >= 2) %>%
# nrow()
# snvs <- nrow(x)
# tibble(sample = sample, atLeast1 = atLeast1, atLeast2 = atLeast2, snvs = snvs)
# }) %>%
# bind_rows() %>%
# left_join(metadata[,c("sample", "genotype")]) %>%
# dplyr::arrange(genotype, sample)
# genesWithSnvs %>%
# dplyr::select(
# Sample = sample, Genotype = genotype, Genes = atLeast1, SNVs = snvs
# ) %>%
# kable(
# align = "l",
# caption = paste0(
# "A summary of the remaining number of genes and SNVs detected in each ",
# "sample after all quality control procedures were performed."
# )
# ) %>%
# kable_styling(
# bootstrap_options = c("striped", "hover", "condensed", "responsive")
# )
For static ASE testing, which describes the difference between two variants of a heterozygous allele in a single sample in an unchanging condition, geneiASE software was chosen. geneiASE requires allele counts tsv format with four columns: featureID, snpID, alternative allele count, reference allele count.
# staticCounts <- lapply(aseRC, function(x){
# x %>%
# left_join(altBias[,c("sample", "mean")]) %>%
# dplyr::select(
# gene, snvID, altCount, refCount, betabinom.p = mean
# )
# })
# lapply(seq_along(staticCounts), function(x){
# path <- paste0(
# "/hpcfs/users/a1647910/210216_sorl1_snv/11_geneiase/counts/",
# names(staticCounts[x]),
# ".static.tsv"
# )
# if (!file.exists(path)) {
# if (!dir.exists(dirname(path))) {
# dir.create(dirname(path), recursive = TRUE)
# }
# write_tsv(staticCounts[[x]], path)
# }
# })
sessionInfo() %>%
pander()
R version 4.1.0 (2021-05-18)
Platform: x86_64-pc-linux-gnu (64-bit)
locale: LC_CTYPE=en_AU.UTF-8, LC_NUMERIC=C, LC_TIME=en_AU.UTF-8, LC_COLLATE=en_AU.UTF-8, LC_MONETARY=en_AU.UTF-8, LC_MESSAGES=en_AU.UTF-8, LC_PAPER=en_AU.UTF-8, LC_NAME=C, LC_ADDRESS=C, LC_TELEPHONE=C, LC_MEASUREMENT=en_AU.UTF-8 and LC_IDENTIFICATION=C
attached base packages: stats4, parallel, stats, graphics, grDevices, utils, datasets, methods and base
other attached packages: ensembldb(v.2.16.2), AnnotationFilter(v.1.16.0), GenomicFeatures(v.1.44.0), AnnotationDbi(v.1.54.1), Biobase(v.2.52.0), GenomicRanges(v.1.44.0), GenomeInfoDb(v.1.28.1), IRanges(v.2.26.0), S4Vectors(v.0.30.0), ngsReports(v.1.8.1), SeqArray(v.1.32.0), gdsfmt(v.1.28.0), UpSetR(v.1.4.0), rmarkdown(v.2.9), pander(v.0.6.4), ggpubr(v.0.4.0), RColorBrewer(v.1.1-2), ggrepel(v.0.9.1), tictoc(v.1.0.1), kableExtra(v.1.3.4), scales(v.1.1.1), AnnotationHub(v.3.0.1), BiocFileCache(v.2.0.0), dbplyr(v.2.1.1), BiocGenerics(v.0.38.0), here(v.1.0.1), future.apply(v.1.7.0), future(v.1.21.0), magrittr(v.2.0.1), forcats(v.0.5.1), stringr(v.1.4.0), dplyr(v.1.0.7), purrr(v.0.3.4), readr(v.1.4.0), tidyr(v.1.1.3), tibble(v.3.1.2), ggplot2(v.3.3.5) and tidyverse(v.1.3.1)
loaded via a namespace (and not attached): readxl(v.1.3.1), backports(v.1.2.1), systemfonts(v.1.0.2), plyr(v.1.8.6), lazyeval(v.0.2.2), crosstalk(v.1.1.1), BiocParallel(v.1.26.1), listenv(v.0.8.0), digest(v.0.6.27), htmltools(v.0.5.1.1), fansi(v.0.5.0), memoise(v.2.0.0), openxlsx(v.4.2.4), globals(v.0.14.0), Biostrings(v.2.60.1), modelr(v.0.1.8), matrixStats(v.0.59.0), svglite(v.2.0.0), prettyunits(v.1.1.1), colorspace(v.2.0-2), blob(v.1.2.1), rvest(v.1.0.0), rappdirs(v.0.3.3), haven(v.2.4.1), xfun(v.0.24), crayon(v.1.4.1), RCurl(v.1.98-1.3), jsonlite(v.1.7.2), zoo(v.1.8-9), glue(v.1.4.2), gtable(v.0.3.0), zlibbioc(v.1.38.0), XVector(v.0.32.0), webshot(v.0.5.2), DelayedArray(v.0.18.0), car(v.3.0-11), abind(v.1.4-5), DBI(v.1.1.1), rstatix(v.0.7.0), Rcpp(v.1.0.7), progress(v.1.2.2), viridisLite(v.0.4.0), xtable(v.1.8-4), foreign(v.0.8-81), bit(v.4.0.4), DT(v.0.18), htmlwidgets(v.1.5.3), httr(v.1.4.2), ellipsis(v.0.3.2), farver(v.2.1.0), XML(v.3.99-0.6), pkgconfig(v.2.0.3), sass(v.0.4.0), utf8(v.1.2.1), labeling(v.0.4.2), tidyselect(v.1.1.1), rlang(v.0.4.11), later(v.1.2.0), munsell(v.0.5.0), BiocVersion(v.3.13.1), cellranger(v.1.1.0), tools(v.4.1.0), cachem(v.1.0.5), cli(v.3.0.0), generics(v.0.1.0), RSQLite(v.2.2.7), broom(v.0.7.8), evaluate(v.0.14), fastmap(v.1.1.0), ggdendro(v.0.1.22), yaml(v.2.2.1), knitr(v.1.33), bit64(v.4.0.5), fs(v.1.5.0), zip(v.2.2.0), KEGGREST(v.1.32.0), mime(v.0.11), xml2(v.1.3.2), biomaRt(v.2.48.2), compiler(v.4.1.0), rstudioapi(v.0.13), plotly(v.4.9.4.1), filelock(v.1.0.2), curl(v.4.3.2), png(v.0.1-7), interactiveDisplayBase(v.1.30.0), ggsignif(v.0.6.2), reprex(v.2.0.0), bslib(v.0.2.5.1), stringi(v.1.6.2), highr(v.0.9), lattice(v.0.20-44), ProtGenerics(v.1.24.0), Matrix(v.1.3-4), vctrs(v.0.3.8), pillar(v.1.6.1), lifecycle(v.1.0.0), BiocManager(v.1.30.16), jquerylib(v.0.1.4), data.table(v.1.14.0), bitops(v.1.0-7), rtracklayer(v.1.52.0), httpuv(v.1.6.1), BiocIO(v.1.2.0), R6(v.2.5.0), promises(v.1.2.0.1), gridExtra(v.2.3), rio(v.0.5.27), parallelly(v.1.26.1), codetools(v.0.2-18), MASS(v.7.3-54), assertthat(v.0.2.1), SummarizedExperiment(v.1.22.0), rjson(v.0.2.20), rprojroot(v.2.0.2), withr(v.2.4.2), GenomicAlignments(v.1.28.0), Rsamtools(v.2.8.0), GenomeInfoDbData(v.1.2.6), hms(v.1.1.0), grid(v.4.1.0), MatrixGenerics(v.1.4.0), carData(v.3.0-4), shiny(v.1.6.0), lubridate(v.1.7.10) and restfulr(v.0.0.13)